{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
   },
   "outputs": [],
   "source": [
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "import os\n",
    "print(os.listdir(\"../input\"))\n",
    "\n",
    "# Any results you write to the current directory are saved as output.\n",
    "\n",
    "import spacy\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "import re\n",
    "from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator\n",
    "\n",
    "\n",
    "#Visualisation\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from plotly import tools\n",
    "import plotly.offline as py\n",
    "py.init_notebook_mode(connected=True)\n",
    "import plotly.graph_objs as go\n",
    "\n",
    "import tensorflow as tf\n",
    "tf.logging.set_verbosity(tf.logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_excel('../input/mh-newspred/Data_Train.xlsx')\n",
    "df_test = pd.read_excel('../input/mh-newspred/Data_Test.xlsx')\n",
    "df_sample = pd.read_excel('../input/mh-newspred/Data_Test.xlsx')\n",
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train.rename(columns={'SECTION':'target','STORY':'comment_text'},inplace=True)\n",
    "df_test.rename(columns={'STORY':'comment_text'},inplace=True)\n",
    "df_train = train_df[['target','comment_text']]\n",
    "\n",
    "df_test = df_test[['comment_text']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train['comment_text'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lengths = df_train.comment_text.str.len()\n",
    "lengths.mean(), lengths.std(), lengths.min(), lengths.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lengths = df_test.comment_text.str.len()\n",
    "lengths.mean(), lengths.std(), lengths.min(), lengths.max()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Preprocess and create TSV files to perform XLNet classification**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_reviews(text):\n",
    "    text = re.sub(r'<[^>]*>', ' ', text, re.UNICODE)\n",
    "    text = re.sub(r'[^\\w\\s]', '', text, re.UNICODE)\n",
    "    text = re.sub(r'[^0-9a-zA-Z]+',' ',text, re.UNICODE)\n",
    "    text = \" \".join(text.split())\n",
    "    text = text.lower()\n",
    "    return text\n",
    "\n",
    "df_train['comment_text'] = df_train.comment_text.apply(lambda x: preprocess_reviews(x))\n",
    "df_test['comment_text'] = df_test.comment_text.apply(lambda x: preprocess_reviews(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# force train into cola format, test is fine as it is\n",
    "df_train = df_train[['target', 'comment_text']]\n",
    "df_train['target'] = np.where(df_train['target']>=0.5,1,0)\n",
    "\n",
    "#Sampling 30% to save training time\n",
    "df_train = df_train.sample(frac=0.3)\n",
    "\n",
    "# export as tab seperated\n",
    "df_train.to_csv('train.tsv', sep='\\t', index=False, header=False)\n",
    "df_test.to_csv('test.tsv', sep='\\t', index=False, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train.shape, df_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Let's copy the XLNet files from git repo to working folder for easy reference**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import module we'll need to import our custom module\n",
    "from shutil import copyfile\n",
    "\n",
    "# copy our file into the working directory (make sure it has .py suffix)\n",
    "for f in os.listdir('../input/xlnetcode/'):\n",
    "    try:\n",
    "        if f.split('.')[1] in ['py', 'json']:\n",
    "            copyfile(src = \"../input/xlnetcode/\"+f, dst = \"../working/\"+f)\n",
    "    except:\n",
    "        continue\n",
    "print(os.listdir('../working'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from absl import flags\n",
    "import xlnet\n",
    "from run_classifier import *\n",
    "import sys"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Performing this step to initialise FLAGS in IPython Notebook**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "remaining_args = FLAGS([sys.argv[0]] + [flag for flag in sys.argv if flag.startswith(\"--\")])\n",
    "assert(remaining_args == [sys.argv[0]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "FLAGS.spiece_model_file = '../input/xlnetcode/spiece.model'\n",
    "FLAGS.model_config_path = '../input/xlnetcode/xlnet_config.json'\n",
    "FLAGS.output_dir =\"../\"\n",
    "FLAGS.model_dir = \"../\"\n",
    "FLAGS.data_dir = \"../working/\"\n",
    "FLAGS.do_train = False\n",
    "FLAGS.train_steps = 1000\n",
    "FLAGS.warmup_steps = 0\n",
    "FLAGS.learning_rate = 1e-5\n",
    "FLAGS.max_save = 999999\n",
    "FLAGS.use_tpu = False\n",
    "\n",
    "#Used not take any of the processors and get from the tasks\n",
    "FLAGS.cls_scope = True"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using appropriate XLNet implementation from here\n",
    "**SentencePiece Tokenizer implementation**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenization\n",
    "import sentencepiece as spm\n",
    "from prepro_utils import preprocess_text, encode_ids\n",
    "\n",
    "sp = spm.SentencePieceProcessor()\n",
    "sp.Load(FLAGS.spiece_model_file)\n",
    "def tokenize_fn(text):\n",
    "    text = preprocess_text(text, lower=FLAGS.uncased)\n",
    "    return encode_ids(sp, text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Initialise GLUEProcessor and specify the column indexes in test and train datasets and create examples**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "processor = GLUEProcessor()\n",
    "label_list = processor.get_labels()\n",
    "processor.label_column = 0\n",
    "processor.text_a_column = 1\n",
    "processor.test_text_a_column = 0\n",
    "train_examples = processor.get_train_examples(FLAGS.data_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_examples[0].label, train_examples[0].text_a, train_examples[0].text_b "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start = time.time()\n",
    "print(\"--------------------------------------------------------\")\n",
    "print(\"Starting to Train\")\n",
    "print(\"--------------------------------------------------------\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_file = os.path.join(FLAGS.output_dir, \"train.tf_record\")\n",
    "tf.logging.info(\"Use tfrecord file {}\".format(train_file))\n",
    "np.random.shuffle(train_examples)\n",
    "tf.logging.info(\"Num of train samples: {}\".format(len(train_examples)))\n",
    "file_based_convert_examples_to_features(\n",
    "        train_examples, label_list, FLAGS.max_seq_length, tokenize_fn,\n",
    "        train_file, FLAGS.num_passes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# RunConfig contains hyperparameters that could be different between pretraining and finetuning.\n",
    "tpu_cluster_resolver = None\n",
    "is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2\n",
    "\n",
    "run_config = tf.contrib.tpu.RunConfig(\n",
    "    cluster=tpu_cluster_resolver,\n",
    "    master=FLAGS.master,\n",
    "    model_dir=FLAGS.output_dir,\n",
    "    save_checkpoints_steps=FLAGS.save_steps,\n",
    "    tpu_config=tf.contrib.tpu.TPUConfig(\n",
    "        iterations_per_loop=FLAGS.iterations,\n",
    "        num_shards=FLAGS.num_core_per_host,\n",
    "        per_host_input_for_training=is_per_host))\n",
    "model_fn = get_model_fn(len(label_list) if label_list is not None else None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf.logging.set_verbosity(tf.logging.INFO)\n",
    "estimator = tf.contrib.tpu.TPUEstimator(\n",
    "        use_tpu=FLAGS.use_tpu,\n",
    "        model_fn=model_fn,\n",
    "        config=run_config,\n",
    "        train_batch_size=FLAGS.train_batch_size,\n",
    "        predict_batch_size=FLAGS.predict_batch_size,\n",
    "        eval_batch_size=FLAGS.eval_batch_size)\n",
    "\n",
    "tf.logging.info(\"***** Running training *****\")\n",
    "tf.logging.info(\"  Num examples = %d\", len(train_examples))\n",
    "tf.logging.info(\"  Batch size = %d\", FLAGS.train_batch_size)\n",
    "tf.logging.info(\"  Num steps = %d\", FLAGS.iterations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_input_fn = file_based_input_fn_builder(\n",
    "        input_file=train_file,\n",
    "        seq_length=FLAGS.max_seq_length,\n",
    "        is_training=True,\n",
    "        drop_remainder=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "end = time.time()\n",
    "print(\"--------------------------------------------------------\")\n",
    "print(\"Total time taken to complete training - \", end - start, \" seconds\")\n",
    "print(\"--------------------------------------------------------\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_examples = processor.get_test_examples(FLAGS.data_dir)\n",
    "tf.logging.info(\"Num of test samples: {}\".format(len(test_examples)))\n",
    "eval_file = os.path.join(FLAGS.output_dir, \"predict.tf_record\")\n",
    "file_based_convert_examples_to_features(\n",
    "        test_examples, label_list, FLAGS.max_seq_length, tokenize_fn,\n",
    "        eval_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.path.getsize('../predict.tf_record')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_input_fn = file_based_input_fn_builder(\n",
    "        input_file=eval_file,\n",
    "        seq_length=FLAGS.max_seq_length,\n",
    "        is_training=False,\n",
    "        drop_remainder=False)\n",
    "predict_results = []\n",
    "with tf.gfile.Open(\"test_results.tsv\", \"w\") as fout:\n",
    "    fout.write(\"index\\tprediction\\n\")\n",
    "\n",
    "    for pred_cnt, result in enumerate(estimator.predict(\n",
    "        input_fn=pred_input_fn,\n",
    "        yield_single_examples=True)):\n",
    "        if pred_cnt % 1000 == 0:\n",
    "            tf.logging.info(\"Predicting submission for example: {}\".format(\n",
    "              pred_cnt))\n",
    "\n",
    "        logits = [float(x) for x in result[\"logits\"].flat]\n",
    "        predict_results.append(logits)\n",
    "\n",
    "        if len(logits) == 1:\n",
    "            label_out = logits[0]\n",
    "        elif len(logits) == 2:\n",
    "            if logits[1] - logits[0] > FLAGS.predict_threshold:\n",
    "                label_out = label_list[1]\n",
    "            else:\n",
    "                label_out = label_list[0]\n",
    "        elif len(logits) > 2:\n",
    "            max_index = np.argmax(np.array(logits, dtype=np.float32))\n",
    "            label_out = label_list[max_index]\n",
    "        else:\n",
    "            raise NotImplementedError\n",
    "\n",
    "        fout.write(\"{}\\t{}\\n\".format(pred_cnt, label_out))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(test_examples), len(predict_results)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Creating submission file**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test_out = pd.read_csv('test_results.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission = df_test_out.iloc[:,1]\n",
    "submission.columns = ['SECTION']\n",
    "submission['SECTION'].to_excel('xlnet.xls')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
